Proteomics: data analysis

Prerequisites

Load the packages needed for the analyses and read-in the relavant data

library(scales)
library(data.table)
library(vctrs)
library(tidyverse)
library(plotly)
library(TissueEnrich)
library(ggrepel)
library(RColorBrewer)
library(pheatmap)
setwd("/work/LAS/geetu-lab/arnstrm/mouse.trophoblast.smallRNAseq")
dat <-
  read.csv("assets/data_processed.csv",
           row.names = NULL,
           stringsAsFactors = TRUE)
orig_data <-
  read.csv("assets/orig_data.csv",
           row.names = NULL,
           stringsAsFactors = TRUE)
source("assets/theme_clean.R")

quick check:

dat_tidy <- dat %>% drop_na()
dat %>% summarise(Number_of_proteins = n())
#>   Number_of_proteins
#> 1                174

Volcano plot

df <- orig_data
exp <- log(1.5, 2)
df$log2fc <- log(df$Ratio, base = 2)
df$negLog10p = -log10(df$Qvalue)
df$diffexpressed[df$log2fc <= -exp &
                   df$Qvalue <= 0.05] <- "up in Diff"
df$diffexpressed[df$log2fc >= exp &
                   df$Qvalue <= 0.05] <- "up in Undiff"
df$diffexpressed[df$log2fc >= -exp &
                   df$log2fc <= exp] <- "other genes"
df$diffexpressed[df$Qvalue > 0.05] <- "other genes"
df$newGenes <-
  str_replace_all(string = df$Genes,
                  pattern = "\\;.*$",
                  replacement = "")
data.table::setDT(df)[diffexpressed == "other genes", newGenes := NA]

g <-
  ggplot(data = df,
         aes(
           x = log2fc,
           y = negLog10p,
           col = diffexpressed,
           label = newGenes
         )) +
  geom_point(alpha = 0.5) +
  theme_classic() +
  scale_color_manual(name = "Expression",
                     values = c("grey", "#c6007b", "#a0b600")) +
  ggtitle(paste("Diff vs. Undiff (proteomics)")) +
  #  geom_text_repel(show.legend = F) +
  xlab("Log2 fold change") +
  ylab("-log10 pvalue") +
  theme(legend.text.align = 0)
ggplotly(g)

Figure 1: Volcano plot (proteomics)

#ggsave("prot_volc.png", dpi=900, width = 8, height = 6)

PCA plot

This was run on another R version due to incompatibility with existing packages. The command is shown below, but it is not executed when knitting the document.

library("MetaboAnalystR")
library(ggplot2)
library(ggrepel)

setwd("~/metabo")
mSet <- InitDataObjects("conc", "stat", FALSE)
mSet <- Read.TextData(mSet, "data_processed.csv", "colu", "disc")

mSet <- SanityCheckData(mSet)
mSet <- RemoveMissingPercent(mSet, percent = 0.5)
mSet <- ImputeMissingVar(mSet, method = "min")
mSet <- PreparePrenormData(mSet)
mSet <-
  Normalization(mSet,
                "NULL",
                "LogNorm",
                "NULL",
                ratio = FALSE,
                ratioNum = 20)
mSet <-
  PlotSampleNormSummary(mSet, "snorm_0_", "png", 300, width = NA)
mSet <- PCA.Anal(mSet)
saveRDS(mSet, "mSet.rds")

Read in the saved RDS file from the previous step and plot the PCA for samples.

mSet <- readRDS("assets/mset.rds")
group <- sapply(strsplit(rownames(mSet$analSet$pca$x), "_"), "[", 1)
intgroup.df <- as.data.frame(group)
d <- data.frame(
  PC1 = mSet$analSet$pca$x[, 1],
  PC2 = mSet$analSet$pca$x[, 2],
  intgroup.df,
  name = rownames(mSet$analSet$pca$x)
)

g <- ggplot(d, aes(PC1, PC2, color = group)) +
  scale_shape_manual(values = 1:10) +
  scale_color_manual(values = c('Diff'      = '#c6007b',
                                'Undiff'  = '#a0b600')) +
  theme_classic() +
  theme(legend.title = element_blank()) +
  geom_point(size = 2, stroke = 2) +
  geom_text_repel(aes(label = name)) +
  xlab(paste("PC1", round(mSet$analSet$pca$variance[1] * 100, 2), "% variance")) +
  ylab(paste("PC2", round(mSet$analSet$pca$variance[2] * 100, 2), "% variance"))
g
Figure 2: PCA plot (proteomics)

Figure 2: PCA plot (proteomics)

Tissue Enrichment

plotTE <- function(inputGenes = gene.list,
                   myColor = "color") {
  gs <-
    GeneSet(geneIds = inputGenes,
            organism = "Mus Musculus",
            geneIdType = SymbolIdentifier())
  output <- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
  en.output <-
    setNames(data.frame(assay(output[[1]]),
                        row.names = rowData(output[[1]])[, 1]),
             colData(output[[1]])[, 1])
  en.output$Tissue <- rownames(en.output)
  logp <- -log10(0.05)
  en.output <-
    mutate(en.output,
           significance = ifelse(Log10PValue > logp,
                                 "colored", "nocolor"))
  en.output$Sig <- "NA"
  ggplot(en.output, aes(reorder(Tissue, Log10PValue),
                        Log10PValue,
                        fill = significance)) +
    geom_bar(stat = 'identity') +
    theme_clean() + ylab("- log10 adj. p-value") + xlab("") +
    scale_fill_manual(values = c("colored" = myColor, "nocolor" = "gray")) +
    scale_y_continuous(expand = expansion(mult = c(0, .1)),
                       breaks = scales::pretty_breaks()) +
    coord_flip()
}
exp <- log(1.2, 2)
diff <- df$newGenes[df$log2fc <= -exp & df$Qvalue <= 0.05]
undiff <- df$newGenes[df$log2fc >= exp & df$Qvalue <= 0.05]
df.diff <- dat_tidy %>%
  rowwise() %>%
  mutate(Diff = mean(c(
    Diff_rep1,   Diff_rep2,  Diff_rep3,  Diff_rep4,  Diff_rep5
  ))) %>%
  dplyr::select(proteinID, Diff) %>%
  dplyr::filter(Diff > 0)  %>%
  ungroup() %>%
  mutate(quart = ntile(Diff, 4)) %>%
  mutate(decile = ntile(Diff, 10))
df.undiff <- dat_tidy %>%
  rowwise() %>%
  mutate(Undiff = mean(
    c(
      Undiff_rep1,
      Undiff_rep2,
      Undiff_rep3,
      Undiff_rep4,
      Undiff_rep5
    )
  )) %>%
  dplyr::select(proteinID, Undiff) %>%
  dplyr::filter(Undiff > 0)  %>%
  ungroup() %>%
  mutate(quart = ntile(Undiff, 4)) %>%
  mutate(decile = ntile(Undiff, 10))

filterCuts <- function(dataIn = df.undiff,
                       cutOff = 4,
                       type = decile) {
  type <- enquo(type)
  dataIn %>%
    dplyr::filter(!!type == cutOff) %>%
    dplyr::select(proteinID)
}
undiff.75pc <- filterCuts(df.undiff, 4, type = quart)
diff.75pc <- filterCuts(df.diff, 4, type = quart)
undiff.90pc <- filterCuts(df.undiff, 10, type = decile)
diff.90pc <- filterCuts(df.diff, 10, type = decile)
heat_colors <- brewer.pal(9, "YlOrRd")
plotTE.heatmap.full <-
  function(inputGenes = gene.list,
           inputTissue = "E14.5-Brain",
           GeneNames = FALSE) {
    gs <-
      GeneSet(geneIds = inputGenes,
              organism = "Mus Musculus",
              geneIdType = SymbolIdentifier())
    output <- teEnrichment(inputGenes = gs, rnaSeqDataset = 3)
    en.output <-
      setNames(data.frame(assay(output[[1]]),
                          row.names = rowData(output[[1]])[, 1]),
               colData(output[[1]])[, 1])
    en.output$Tissue <- rownames(en.output)
    seExp <- output[[2]][[inputTissue]]
    exp <-
      setNames(data.frame(assay(seExp), row.names = rowData(seExp)[, 1]),
               colData(seExp)[, 1])
    g <- pheatmap(
      exp,
      color = heat_colors,
      cluster_rows = F,
      cluster_cols  = T,
      show_rownames = GeneNames,
      border_color = NA,
      fontsize = 10,
      scale = "row",
      fontsize_row = 8
    )
    g
  }

Enrichment plots (TissueEnrich)

Diff DE

plotTE(unique(diff), myColor = "#c6007b")
Figure 3A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data

Figure 3A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data

Undiff DE

plotTE(unique(undiff), myColor = "#a0b600")
Figure 3B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data

Figure 3B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data

Diff 75th pc

plotTE(as.character(diff.75pc$proteinID), "#c6007b")
Figure 3C: Top quartile proteins (Diff) enrichment in Mouse ENCODE data

Figure 3C: Top quartile proteins (Diff) enrichment in Mouse ENCODE data

Diff 90th pc

plotTE(as.character(diff.90pc$proteinID), "#c6007b")
Figure 3D: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Figure 3D: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Undiff 75th pc

plotTE(as.character(undiff.75pc$proteinID), "#a0b600")
Figure 3E: Top quartile proteins (Undiff) enrichment in Mouse ENCODE data

Figure 3E: Top quartile proteins (Undiff) enrichment in Mouse ENCODE data

Undiff 90th pc

plotTE(as.character(undiff.90pc$proteinID), "#a0b600")
Figure 3F: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Figure 3F: Top decile proteins (Diff) enrichment in Mouse ENCODE data

Enrichment Heatmaps for E14.5-Placenta

Diff DE

plotTE.heatmap.full(
  inputGenes = unique(diff),
  inputTissue = "E14.5-Placenta",
  GeneNames = TRUE
)
Figure 4A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data

Figure 4A: Differentially expressed proteins (up-regulated in Diff) enrichment in Mouse ENCODE data

Undiff DE

plotTE.heatmap.full(
  inputGenes = unique(undiff),
  inputTissue = "E14.5-Placenta",
  GeneNames = TRUE
)
Figure 4B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data

Figure 4B: Differentially expressed proteins (up-regulated in Undiff) enrichment in Mouse ENCODE data

Session Information

sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] pheatmap_1.0.12             RColorBrewer_1.1-3         
#>  [3] ggrepel_0.9.1               TissueEnrich_1.16.0        
#>  [5] GSEABase_1.58.0             graph_1.74.0               
#>  [7] annotate_1.74.0             XML_3.99-0.10              
#>  [9] AnnotationDbi_1.58.0        SummarizedExperiment_1.26.1
#> [11] Biobase_2.56.0              GenomicRanges_1.48.0       
#> [13] GenomeInfoDb_1.32.2         IRanges_2.30.0             
#> [15] S4Vectors_0.34.0            BiocGenerics_0.42.0        
#> [17] MatrixGenerics_1.8.1        matrixStats_0.62.0         
#> [19] ensurer_1.1                 plotly_4.10.0              
#> [21] forcats_0.5.1               stringr_1.4.0              
#> [23] dplyr_1.0.9                 purrr_0.3.4                
#> [25] readr_2.1.2                 tidyr_1.2.0                
#> [27] tibble_3.1.8                ggplot2_3.3.6              
#> [29] tidyverse_1.3.2             vctrs_0.4.1                
#> [31] data.table_1.14.2           scales_1.2.0               
#> 
#> loaded via a namespace (and not attached):
#>  [1] googledrive_2.0.0      colorspace_2.0-3       ellipsis_0.3.2        
#>  [4] XVector_0.36.0         fs_1.5.2               rstudioapi_0.13       
#>  [7] farver_2.1.1           bit64_4.0.5            fansi_1.0.3           
#> [10] lubridate_1.8.0        xml2_1.3.3             cachem_1.0.6          
#> [13] knitr_1.39             jsonlite_1.8.0         broom_1.0.0           
#> [16] dbplyr_2.2.1           png_0.1-7              compiler_4.2.1        
#> [19] httr_1.4.3             backports_1.4.1        assertthat_0.2.1      
#> [22] Matrix_1.4-1           fastmap_1.1.0          lazyeval_0.2.2        
#> [25] gargle_1.2.0           cli_3.3.0              htmltools_0.5.3       
#> [28] tools_4.2.1            gtable_0.3.0           glue_1.6.2            
#> [31] GenomeInfoDbData_1.2.8 Rcpp_1.0.9             cellranger_1.1.0      
#> [34] jquerylib_0.1.4        Biostrings_2.64.0      crosstalk_1.2.0       
#> [37] xfun_0.31              rvest_1.0.2            lifecycle_1.0.1       
#> [40] googlesheets4_1.0.0    zlibbioc_1.42.0        hms_1.1.1             
#> [43] yaml_2.3.5             memoise_2.0.1          sass_0.4.2            
#> [46] stringi_1.7.8          RSQLite_2.2.15         highr_0.9             
#> [49] rlang_1.0.4            pkgconfig_2.0.3        bitops_1.0-7          
#> [52] evaluate_0.15          lattice_0.20-45        htmlwidgets_1.5.4     
#> [55] labeling_0.4.2         bit_4.0.4              tidyselect_1.1.2      
#> [58] magrittr_2.0.3         bookdown_0.27          R6_2.5.1              
#> [61] generics_0.1.3         DelayedArray_0.22.0    DBI_1.1.3             
#> [64] pillar_1.8.0           haven_2.5.0            withr_2.5.0           
#> [67] KEGGREST_1.36.3        RCurl_1.98-1.8         modelr_0.1.8          
#> [70] crayon_1.5.1           utf8_1.2.2             tzdb_0.3.0            
#> [73] rmarkdown_2.14         grid_4.2.1             readxl_1.4.0          
#> [76] blob_1.2.3             rmdformats_1.0.4       reprex_2.0.1          
#> [79] digest_0.6.29          xtable_1.8-4           munsell_0.5.0         
#> [82] viridisLite_0.4.0      bslib_0.4.0